# -*- coding: utf-8 -*-
import scrapy
import re

from scrapy.loader import ItemLoader
from ..items import Kpp77Item
class KppSpider(scrapy.Spider):
    """Recursively crawl www.77kpp.com and yield Kpp77Item objects.

    Follows every ``<a href>`` link found on each page (the crawl is kept
    on-site by ``allowed_domains``) and emits one item — URL plus link
    text — for every link whose path ends in a numeric id.
    """

    name = 'kpp'
    allowed_domains = ['www.77kpp.com']
    start_urls = ['https://www.77kpp.com']

    # Compiled once at class level (hoisted out of the per-link loop).
    # Raw string avoids the invalid-escape warning the old ".*/(\d+)"
    # literal produced. Matches a trailing numeric path segment, e.g.
    # "https://www.77kpp.com/vod/123" -> group(1) == "123".
    _ID_RE = re.compile(r".*/(\d+)")

    def parse(self, response):
        """Parse one fetched page.

        Args:
            response: the scrapy Response for the page being parsed.

        Yields:
            scrapy.Request: one per discovered link (recursive crawl,
                callback is this same method).
            Kpp77Item: for links that have both link text and a numeric
                id in their URL path.
        """
        # response.xpath() is the modern shortcut for
        # scrapy.Selector(response).xpath().
        for link_sel in response.xpath('//a[@href]'):
            # Read the attribute directly rather than regexing the raw
            # HTML with 'href="(.*?)"' — that pattern raised IndexError
            # on single-quoted attributes or any non-matching markup.
            href = link_sel.xpath('@href').extract_first()
            if not href:
                continue

            # urljoin() resolves relative paths, protocol-relative
            # ('//host/...') and absolute URLs correctly. The previous
            # startswith('https') check corrupted 'http://...' links by
            # prefixing the site root onto an already-absolute URL.
            link = response.urljoin(href)

            # Recurse; off-site requests are filtered by allowed_domains.
            yield scrapy.Request(link, callback=self.parse)

            link_text = link_sel.xpath('text()').extract()
            com_id = self._ID_RE.match(link)
            # Build the item only when it will actually be yielded —
            # the old code relied on an empty Item being falsy.
            if link_text and com_id:
                item = Kpp77Item()
                item['url'] = link
                item['title'] = link_text[0]
                yield item
